#Part I
In the hackathon a project was proposed to collect data from student video watching. A sample of this data is available in the file video-data.csv.
stid = student id; year = year the student watched the video; participation = whether or not the student opened the video; watch.time = how long the student watched the video; confusion.points = how many times the student rewatched a section of the video; key.points = how many times the student skipped or increased the speed of the video
#Install the 'tidyverse' package or if that does not work, install the 'dplyr' and 'tidyr' packages.
#Load the package(s) you just installed
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyr)
library(dplyr)
# Read the video-watching sample data; expects video-data.csv in the working
# directory with columns stid, year, participation, watch.time,
# confusion.points and key.points.
D1 <- read.csv("video-data.csv", header = TRUE)
# Create a data frame that only contains the year 2018
D2 <- filter(D1, year == 2018)
# Generate a histogram of the watch time for the year 2018
hist(D2$watch.time)
# Change the number of breaks to 100 -- do you get the same impression?
# (Finer bins can reveal structure that the default bins hide.)
hist(D2$watch.time, breaks = 100)
# Cut the y-axis off at 10 so the smaller bars are easier to compare
hist(D2$watch.time, breaks = 100, ylim = c(0,10))
# Restore the y-axis and use unequal bins: 0-5, 5-20, 20-25, 25-35.
# Note: with non-equidistant breaks hist() plots density rather than
# counts by default (freq defaults to FALSE).
hist(D2$watch.time, breaks = c(0,5,20,25,35))
# Plot the number of confusion points against the watch time.
# NOTE(review): "plot y against x" convention would put confusion.points
# on the y-axis; here it is on the x-axis -- confirm which was intended.
plot(D1$confusion.points, D1$watch.time)
# Create two small numeric vectors, x & y.
# (The names x and y are kept: table() uses them as the dimension names
# that barplot() displays.)
x <- c(1, 3, 2, 7, 6, 4, 4)
y <- c(2, 4, 2, 3, 2, 4, 3)

# Cross-tabulate x against y into a contingency table
table1 <- table(x, y)

# Display the contingency table as a stacked barplot (one bar per y value)
barplot(table1)
# Create a data frame of the average total key points for each year and
# plot the two against each other as a dashed line.
# `.groups = "drop"` returns an ungrouped frame and silences the
# "`summarise()` ungrouping output" message.
D3 <- D1 %>%
  group_by(year) %>%
  summarise(mean_key = mean(key.points), .groups = "drop")
plot(D3$year, D3$mean_key, type = "l", lty = "dashed")

# Create a boxplot of watch time for three students.
# (%in% expresses set membership more clearly than chained `|` tests.)
D4 <- filter(D1, stid %in% c(4, 20, 22))
# droplevels() removes factor levels with no remaining data so the
# filtered-out students do not appear as empty groups in the boxplot.
D4 <- droplevels(D4)
boxplot(D4$watch.time ~ D4$stid, xlab = "Student", ylab = "Watch Time")
## Pairs
# Use matrix notation to select columns 2, 5, 6, and 7.
# (Position-based selection is fragile if the CSV column order changes;
# selecting by name would be safer.)
D5 <- D1[,c(2,5,6,7)]
# Draw a scatterplot matrix for every pairwise combination of variables
pairs(D5)
## Part II
#rnorm(100, 75, 15) creates a random sample with a mean of 75 and standard deviation of 15
#filter() can be used to set a maximum and minimum value
#round() rounds numbers to whole number values
#sample() draws a random sample from the groups vector according to a uniform distribution
# Simulate data: 100 exam-style scores, mean 75, sd 15.
# NOTE(review): there is no set.seed() call, so every run draws different
# numbers and the printed output below will not reproduce -- consider
# adding set.seed() for reproducibility.
score <- rnorm(100, 75, 15)
hist(score,breaks=30)
S1 <-data.frame(score)
# Cap scores at 100: drop the samples above 100 here; a literal 100 is
# appended for each dropped sample further below via bind_rows().
library(dplyr)
S1<-filter(S1,score<=100)
hist(S1$score)
S1
## score
## 1 80.87487
## 2 85.01307
## 3 88.43175
## 4 85.45668
## 5 62.00681
## 6 82.52423
## 7 78.15836
## 8 60.30607
## 9 91.58159
## 10 72.89160
## 11 95.22551
## 12 81.92356
## 13 85.42065
## 14 54.27170
## 15 62.60689
## 16 92.09013
## 17 75.18500
## 18 69.28725
## 19 71.33826
## 20 60.52950
## 21 67.73504
## 22 97.09658
## 23 84.62451
## 24 62.72625
## 25 77.73485
## 26 78.86311
## 27 70.83471
## 28 94.19442
## 29 97.80616
## 30 61.77682
## 31 90.20980
## 32 83.53310
## 33 81.62467
## 34 80.34869
## 35 74.04995
## 36 69.29995
## 37 92.54230
## 38 56.72008
## 39 85.40762
## 40 89.84091
## 41 62.66578
## 42 51.91075
## 43 76.73034
## 44 67.68292
## 45 70.69177
## 46 66.65772
## 47 54.52726
## 48 71.86613
## 49 51.99740
## 50 74.27187
## 51 67.59639
## 52 36.87490
## 53 70.11316
## 54 71.34340
## 55 82.24572
## 56 96.48479
## 57 54.51704
## 58 64.07890
## 59 58.50897
## 60 51.19677
## 61 79.92925
## 62 41.60717
## 63 69.61532
## 64 71.42719
## 65 76.39338
## 66 76.34355
## 67 86.97454
## 68 64.28295
## 69 87.46535
## 70 96.58134
## 71 72.91454
## 72 91.81606
## 73 72.25564
## 74 83.45995
## 75 53.88130
## 76 36.11727
## 77 76.89793
## 78 65.18067
## 79 93.68269
## 80 69.36480
## 81 66.06799
## 82 68.56357
## 83 40.61765
## 84 79.70890
## 85 89.67365
## 86 76.10254
## 87 60.10658
## 88 85.77869
## 89 72.40631
## 90 47.17927
## 91 58.39076
## 92 80.85854
## 93 77.88864
## 94 71.65038
## 95 71.31879
# Add back the capped scores: this sample had 5 scores above 100.
# Name the column "score" so bind_rows() stacks it under S1$score.
# The original unnamed data.frame(rep(100, 5)) produced a column called
# "rep.100..5.", so bind_rows() created TWO columns padded with NAs
# (visible in the stale printout below) instead of appending the values.
S2 <- data.frame(score = rep(100, 5))
names(S2)
## [1] "score"
S3 <- bind_rows(S1, S2)
S3
# NOTE(review): the printout below predates this fix; after re-knitting it
# should show a single `score` column with rows 96-100 equal to 100.
## score rep.100..5.
## 1 80.87487 NA
## 2 85.01307 NA
## 3 88.43175 NA
## 4 85.45668 NA
## 5 62.00681 NA
## 6 82.52423 NA
## 7 78.15836 NA
## 8 60.30607 NA
## 9 91.58159 NA
## 10 72.89160 NA
## 11 95.22551 NA
## 12 81.92356 NA
## 13 85.42065 NA
## 14 54.27170 NA
## 15 62.60689 NA
## 16 92.09013 NA
## 17 75.18500 NA
## 18 69.28725 NA
## 19 71.33826 NA
## 20 60.52950 NA
## 21 67.73504 NA
## 22 97.09658 NA
## 23 84.62451 NA
## 24 62.72625 NA
## 25 77.73485 NA
## 26 78.86311 NA
## 27 70.83471 NA
## 28 94.19442 NA
## 29 97.80616 NA
## 30 61.77682 NA
## 31 90.20980 NA
## 32 83.53310 NA
## 33 81.62467 NA
## 34 80.34869 NA
## 35 74.04995 NA
## 36 69.29995 NA
## 37 92.54230 NA
## 38 56.72008 NA
## 39 85.40762 NA
## 40 89.84091 NA
## 41 62.66578 NA
## 42 51.91075 NA
## 43 76.73034 NA
## 44 67.68292 NA
## 45 70.69177 NA
## 46 66.65772 NA
## 47 54.52726 NA
## 48 71.86613 NA
## 49 51.99740 NA
## 50 74.27187 NA
## 51 67.59639 NA
## 52 36.87490 NA
## 53 70.11316 NA
## 54 71.34340 NA
## 55 82.24572 NA
## 56 96.48479 NA
## 57 54.51704 NA
## 58 64.07890 NA
## 59 58.50897 NA
## 60 51.19677 NA
## 61 79.92925 NA
## 62 41.60717 NA
## 63 69.61532 NA
## 64 71.42719 NA
## 65 76.39338 NA
## 66 76.34355 NA
## 67 86.97454 NA
## 68 64.28295 NA
## 69 87.46535 NA
## 70 96.58134 NA
## 71 72.91454 NA
## 72 91.81606 NA
## 73 72.25564 NA
## 74 83.45995 NA
## 75 53.88130 NA
## 76 36.11727 NA
## 77 76.89793 NA
## 78 65.18067 NA
## 79 93.68269 NA
## 80 69.36480 NA
## 81 66.06799 NA
## 82 68.56357 NA
## 83 40.61765 NA
## 84 79.70890 NA
## 85 89.67365 NA
## 86 76.10254 NA
## 87 60.10658 NA
## 88 85.77869 NA
## 89 72.40631 NA
## 90 47.17927 NA
## 91 58.39076 NA
## 92 80.85854 NA
## 93 77.88864 NA
## 94 71.65038 NA
## 95 71.31879 NA
## 96 NA 100
## 97 NA 100
## 98 NA 100
## 99 NA 100
## 100 NA 100
# Create the interest-group variable in the S3 data frame: each of the
# 100 rows gets one of four interests, drawn with replacement
# (sample() draws uniformly by default).
interest <- c("sport","music","nature","literature")
S3$interest <- sample(interest, 100, replace = TRUE)
S3
## score rep.100..5. interest
## 1 80.87487 NA nature
## 2 85.01307 NA nature
## 3 88.43175 NA music
## 4 85.45668 NA sport
## 5 62.00681 NA literature
## 6 82.52423 NA music
## 7 78.15836 NA nature
## 8 60.30607 NA music
## 9 91.58159 NA music
## 10 72.89160 NA literature
## 11 95.22551 NA nature
## 12 81.92356 NA literature
## 13 85.42065 NA music
## 14 54.27170 NA nature
## 15 62.60689 NA music
## 16 92.09013 NA nature
## 17 75.18500 NA music
## 18 69.28725 NA nature
## 19 71.33826 NA literature
## 20 60.52950 NA nature
## 21 67.73504 NA music
## 22 97.09658 NA music
## 23 84.62451 NA nature
## 24 62.72625 NA sport
## 25 77.73485 NA literature
## 26 78.86311 NA sport
## 27 70.83471 NA nature
## 28 94.19442 NA literature
## 29 97.80616 NA literature
## 30 61.77682 NA music
## 31 90.20980 NA nature
## 32 83.53310 NA sport
## 33 81.62467 NA music
## 34 80.34869 NA sport
## 35 74.04995 NA literature
## 36 69.29995 NA music
## 37 92.54230 NA music
## 38 56.72008 NA nature
## 39 85.40762 NA music
## 40 89.84091 NA nature
## 41 62.66578 NA music
## 42 51.91075 NA nature
## 43 76.73034 NA music
## 44 67.68292 NA nature
## 45 70.69177 NA music
## 46 66.65772 NA literature
## 47 54.52726 NA nature
## 48 71.86613 NA music
## 49 51.99740 NA nature
## 50 74.27187 NA nature
## 51 67.59639 NA sport
## 52 36.87490 NA literature
## 53 70.11316 NA music
## 54 71.34340 NA music
## 55 82.24572 NA sport
## 56 96.48479 NA sport
## 57 54.51704 NA sport
## 58 64.07890 NA nature
## 59 58.50897 NA literature
## 60 51.19677 NA literature
## 61 79.92925 NA music
## 62 41.60717 NA sport
## 63 69.61532 NA sport
## 64 71.42719 NA sport
## 65 76.39338 NA music
## 66 76.34355 NA sport
## 67 86.97454 NA sport
## 68 64.28295 NA sport
## 69 87.46535 NA music
## 70 96.58134 NA nature
## 71 72.91454 NA nature
## 72 91.81606 NA sport
## 73 72.25564 NA literature
## 74 83.45995 NA literature
## 75 53.88130 NA sport
## 76 36.11727 NA music
## 77 76.89793 NA sport
## 78 65.18067 NA nature
## 79 93.68269 NA music
## 80 69.36480 NA literature
## 81 66.06799 NA sport
## 82 68.56357 NA sport
## 83 40.61765 NA sport
## 84 79.70890 NA nature
## 85 89.67365 NA nature
## 86 76.10254 NA music
## 87 60.10658 NA literature
## 88 85.77869 NA music
## 89 72.40631 NA nature
## 90 47.17927 NA nature
## 91 58.39076 NA music
## 92 80.85854 NA music
## 93 77.88864 NA literature
## 94 71.65038 NA literature
## 95 71.31879 NA sport
## 96 NA 100 music
## 97 NA 100 nature
## 98 NA 100 sport
## 99 NA 100 nature
## 100 NA 100 sport
# Histogram of all 100 scores (95 kept draws + 5 capped at 100)
hist(S3$score,breaks=30)
# cut() divides the range of scores into 10 equal-width intervals and codes
# each score by the interval it falls in. `letters` is the built-in vector
# of lowercase letters; the first 10 serve as interval labels.
label <-letters[1:10]
S3$breaks <-cut(S3$score,breaks=10,labels=label)
library(RColorBrewer)
# display.brewer.all() shows the available palettes: the top section is
# sequential, the middle qualitative, and the bottom diverging.
# Make an RColorBrewer palette available and assign colors.
# NOTE(review): brewer.pal(10, "Set3") returns 10 colors; assigning them to
# a 100-row data frame recycles them row-wise, so S3$colors follows row
# order rather than bin membership -- confirm this matches the intent.
S3$colors <- brewer.pal(10, "Set3")
hist(S3$score,col=S3$colors)
# Make a vector of 4 colors from RColorBrewer, one per interest group
interest.col <-brewer.pal(4,"Dark2")
boxplot(score ~ interest, S3,col=interest.col)
# Simulate login counts (1 to 25, uniform) and plot them against score
S3$login <- sample(1:25, 100, replace = TRUE)
plot(S3$login, S3$score, col=S3$colors, main="Student Logins vs. Scores")
# Flag music-interest rows red, everything else green.
# NOTE(review): col1 is created but never used in any later plot --
# possibly the scatterplot above was meant to use it; confirm.
S3$col1 <- ifelse(S3$interest == "music","red","green")
# Built-in datasets: monthly airline passenger counts (a ts object) and iris
AP<- data.frame(AirPassengers)
plot(AirPassengers)
IR<- data.frame(iris)
pairs(IR)
The pairs function creates a set of plots for every combination of variables in the dataset, and some of these plots show patterns of correlation between variables. We can further investigate the correlations between the following variable pairs: Sepal.Length by Sepal.Width; Sepal.Length by Petal.Length; Sepal.Length by Petal.Width; Sepal.Width by Petal.Length; Sepal.Width by Petal.Width; Petal.Length by Petal.Width. All of these pairs appear to show some correlation, but if only one can be chosen for further investigation, Petal.Length by Petal.Width seems the most appropriate pair to correlate.
In this repository you will find data describing Swirl activity from the class so far this semester. Please connect RStudio to this repository.
Load the swirl-data.csv file into a data frame called DF1. The variables are:
course_name - the name of the R course the student attempted
lesson_name - the lesson name
question_number - the question number attempted
correct - whether the question was answered correctly
attempt - how many times the student attempted the question
skipped - whether the student skipped the question
datetime - the date and time the student attempted the question
hash - anonymized student ID
# Load the swirl activity log into DF1.
DF1 <- read.csv("swirl-data.csv", header = TRUE)
View(DF1)

# Create a new data frame from DF1, called DF2, that only includes the
# variables hash, lesson_name and attempt (columns 8, 2 and 5).
DF2 <- DF1[, c(2, 5, 8)]
View(DF2)

# Use group_by() to create a data frame, DF3, that sums all the attempts
# for each hash by each lesson_name. `.groups = "drop"` returns an
# ungrouped result and silences summarise()'s regrouping message.
DF3 <- DF2 %>%
  group_by(hash, lesson_name) %>%
  summarise(sum_attempts = sum(attempt), .groups = "drop")
View(DF3)

# Convert DF3 so that all the lesson names become column names.
# pivot_wider() supersedes spread() as of tidyr 1.0.
DF3_wide <- pivot_wider(DF3, names_from = lesson_name,
                        values_from = sum_attempts)
View(DF3_wide)

# Create a new data frame from DF1, called DF4, that only includes the
# variables hash, lesson_name and correct.
DF4 <- DF1[, c(2, 4, 8)]
View(DF4)

# Recode the correct variable so that TRUE is 1 and FALSE is 0.
# (Comparing against the string "TRUE" also works when read.csv has left
# the column as character.)
DF4$correct <- ifelse(DF4$correct == "TRUE", 1, 0)
View(DF4)

# Create DF5: the mean score for each student on each course.
DF5 <- DF1[, c(1, 4, 8)]  # course_name, correct and hash
DF5$correct <- ifelse(DF5$correct == "TRUE", 1, 0)
DF5$correct[is.na(DF5$correct)] <- 0  # replace all NAs with 0
DF5 <- DF5 %>%
  group_by(hash, course_name) %>%
  summarise(mean_score = mean(correct), .groups = "drop")
View(DF5)

# Convert the datetime variable into month-day-year format and create
# DF6: the average correct for each day.
DF6 <- DF1[, c(4, 7)]
library(anytime)
DF6$datetime <- anytime(DF6$datetime)
DF6$datetime <- format(as.Date(DF6$datetime), "%m-%d-%Y")
DF6$correct <- ifelse(DF6$correct == "TRUE", 1, 0)
DF6$correct[is.na(DF6$correct)] <- 0  # replace all NAs with 0
DF6 <- DF6 %>%
  group_by(datetime) %>%
  summarise(mean_correct = mean(correct), .groups = "drop")
View(DF6)
Finally use the knitr function to generate an html document from your work. Commit, Push and Pull Request your work back to the main branch of the repository. Make sure you include both the .Rmd file and the .html file.